#!pip install -q pycountry
#!pip install gensim
#!pip install textblob
#!pip install wordcloud
#!pip install plotly
from scipy import stats
import warnings
warnings.filterwarnings("ignore")
import math
import numpy as np
import scipy as sp
import pandas as pd
import pycountry
from sklearn import metrics
from sklearn.utils import shuffle
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import nltk
from textblob import TextBlob
from wordcloud import WordCloud
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import random
import networkx as nx
from pandas import Timestamp
import requests
from IPython.display import HTML
import seaborn as sns
from tqdm import tqdm
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
tqdm.pandas()
np.random.seed(0)
%env PYTHONHASHSEED=0
import warnings
warnings.filterwarnings("ignore")
# Read the four cleaned paper collections from their CSV exports.
pmc_df = pd.read_csv('D:/GSUCoursework/BigDataExp/Datasets/cleaned csv/clean_pmc.csv')
biorxiv_df = pd.read_csv('D:/GSUCoursework/BigDataExp/Datasets/cleaned csv/biorxiv_clean.csv')
comm_use_df = pd.read_csv('D:/GSUCoursework/BigDataExp/Datasets/cleaned csv/clean_comm_use.csv')
noncomm_use_df = pd.read_csv('D:/GSUCoursework/BigDataExp/Datasets/cleaned csv/clean_noncomm_use.csv')

# Stack the collections row-wise (PMC first) under one fresh 0..n-1 index.
papers_df = pd.concat(
    [pmc_df, biorxiv_df, comm_use_df, noncomm_use_df],
    axis=0,
    ignore_index=True,
)

# Notebook display: the combined table, then its 'authors' column.
papers_df
papers_df['authors']
# Load the cleaned COVID-19 case table and preview its first rows.
full_table = pd.read_csv('D:/GSUCoursework/BigDataExp/Datasets/Covid_19_clean/Covid_19_latest/covid_19_clean_complete.csv')
full_table.head(10)

# Show the dtypes, parse the 'Date' column into datetimes, then show the dtypes again.
full_table.dtypes
full_table['Date'] = pd.to_datetime(full_table['Date'])
full_table.dtypes

# Count the missing values in every column.
full_table.isna().sum()

# A missing province/state becomes an empty string; re-check the counts afterwards.
full_table['Province/State'] = full_table['Province/State'].fillna('')
full_table.isna().sum()

# Use the single label 'China' for rows recorded as 'Mainland China'.
full_table['Country/Region'] = full_table['Country/Region'].replace({'Mainland China': 'China'})

# Active cases = confirmed cases minus deaths minus recoveries.
full_table['Active'] = full_table['Confirmed'].sub(full_table['Deaths']).sub(full_table['Recovered'])
full_table.head(10)
# Rows that belong to cruise ships (matched on the province or the country label).
ship = full_table[full_table['Province/State'].str.contains('Grand Princess') | full_table['Country/Region'].str.contains('Cruise Ship')]

# Separate the China rows (`china`) from the rest of the world (`row`).
china = full_table[full_table['Country/Region'] == 'China']
row = full_table[full_table['Country/Region'] != 'China']

# Snapshot of the most recent date in the table, split the same way.
# `.max()` is the vectorised equivalent of the builtin max() over the column.
full_latest = full_table[full_table['Date'] == full_table['Date'].max()].reset_index()
china_latest = full_latest[full_latest['Country/Region'] == 'China']
row_latest = full_latest[full_latest['Country/Region'] != 'China']

# Latest totals, condensed per country (per province for China).
# Several columns must be selected with a list (double brackets): the bare
# tuple form `groupby(...)['a', 'b']` is rejected by pandas >= 2.0.
full_latest_grouped = full_latest.groupby('Country/Region')[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum().reset_index()
china_latest_grouped = china_latest.groupby('Province/State')[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum().reset_index()
row_latest_grouped = row_latest.groupby('Country/Region')[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum().reset_index()
# Symptom table: one row per symptom with its percentage value.
symptoms = pd.DataFrame(
    [
        ('Fever', 87.9),
        ('Dry cough', 67.7),
        ('Fatigue', 38.1),
        ('Sputum production', 33.4),
        ('Shortness of breath', 18.6),
        ('Muscle pain', 14.8),
        ('Sore throat', 13.9),
        ('Headache', 13.6),
        ('Chills', 11.4),
        ('Nausea or vomiting', 5.0),
        ('Nasal congestion', 4.8),
        ('Diarrhoea', 3.7),
        ('Haemoptysis', 0.9),
        ('Conjunctival congestion', 0.8),
    ],
    columns=['symptom', 'percentage'],
)
symptoms
# Treemap: one tile per symptom, sized and coloured by its percentage.
fig = px.treemap(
    symptoms,
    path=['symptom'],
    values='percentage',
    color='percentage',
    hover_data=['symptom'],
    color_continuous_scale='Rainbow',
)
fig.show()

# Pie chart of the same data; update_traces returns the figure it was called
# on, so the trace styling is chained straight onto the constructor.
fig = px.pie(
    symptoms,
    names="symptom",
    values="percentage",
    template="seaborn",
).update_traces(rotation=90, pull=0.05, textinfo="percent+label")
fig.show()
# Consolidated table: totals of each case metric on the most recent date.
# (A per-country/province max() that used to be assigned to `temp` first was
# overwritten on the very next line without being read, so it is not computed.)
# The column selection uses a list because pandas >= 2.0 rejects the bare
# tuple form `groupby(...)['a', 'b']`.
temp = full_table.groupby('Date')[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum().reset_index()
temp = temp[temp['Date'] == temp['Date'].max()].reset_index(drop=True)
temp.style.background_gradient(cmap='Pastel1')
# Distinct country/region labels, in order of first appearance in the table.
countries = full_table['Country/Region'].drop_duplicates().tolist()
print(countries)
print("\nTotal countries affected by virus: ", len(countries))
1) Saves time when initially exploring your dataset
2) Makes it easy to modify and export your plot
3) Offers a more ornate visualization, which is well-suited for conveying the important insights hidden within your dataset
def pie_plot(cnt_srs, colors, title):
    """Build a Plotly pie trace (with a 0.7 hole) from a counts series.

    Args:
        cnt_srs: Object exposing ``index`` (used as slice labels) and
            ``values`` (used as slice sizes).
        colors: Slice colours handed to the trace marker.
        title: Title handed to the trace.

    Returns:
        The configured ``go.Pie`` trace; it is not attached to any figure.
    """
    # Every slice gets a 2px black outline.
    outline = dict(color='#000000', width=2)
    slice_style = dict(colors=colors, line=outline)
    return go.Pie(
        labels=cnt_srs.index,
        values=cnt_srs.values,
        title=title,
        hoverinfo='percent+value',
        textinfo='percent',
        textposition='inside',
        hole=0.7,
        showlegend=True,
        marker=slice_style,
    )
# Plotly visualization: offline-mode imports and notebook initialisation.
import plotly.graph_objs as go
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot

# Required to use plotly offline in a Jupyter notebook.
py.init_notebook_mode(connected=True)

# Pie trace of how many rows each country/region has in the case table.
country_counts = full_table['Country/Region'].value_counts()
py.iplot([pie_plot(country_counts, ['cyan', 'gold'], 'Country')])
# Totals of each case metric per (date, country) pair.
# The column selection uses a list because pandas >= 2.0 rejects the bare
# tuple form `groupby(...)['a', 'b']`.
full_grouped = full_table.groupby(['Date', 'Country/Region'])[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum().reset_index()

# Choropleth over time: one animation frame per date (formatted YYYY-MM-DD),
# countries coloured by the natural log of their confirmed count.
# NOTE(review): np.log yields -inf for rows whose confirmed count is 0 --
# confirm how the colour scale is expected to treat those rows.
fig = px.choropleth(full_grouped, locations="Country/Region", locationmode='country names', color=np.log(full_grouped["Confirmed"]),
                    hover_name="Country/Region", animation_frame=full_grouped["Date"].dt.strftime('%Y-%m-%d'),
                    title='Cases over time', color_continuous_scale=px.colors.sequential.Magenta)
# Hide the colour bar.
fig.update(layout_coloraxis_showscale=False)
fig.show()
# Rows for the most recent date in the table.
top = full_table[full_table['Date'] == full_table['Date'].max()]

# The 20 countries with the highest confirmed totals on that date.
top_casualities = (
    top.groupby(by='Country/Region')['Confirmed']
    .sum()
    .sort_values(ascending=False)
    .head(20)
    .reset_index()
)
top_casualities

# Bar chart: confirmed count on x, country on y.
plt.figure(figsize=(15, 10))
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel("Total cases", fontsize=30)
plt.ylabel('Country', fontsize=30)
plt.title("Top 20 countries having most confirmed cases", fontsize=30)
ax = sns.barplot(x=top_casualities['Confirmed'], y=top_casualities['Country/Region'])

# Write the thousands-separated value at the end of every bar.
for i, value in enumerate(top_casualities['Confirmed']):
    ax.text(value, i - .05, f'{value:,.0f}', size=10, ha='left', va='center')
ax.set(xlabel='Total cases', ylabel='Country')
Observations :
1) China led this ranking for many days, but it is now bringing the spread of the pandemic under control.
2) The number of confirmed cases is high in the US, Italy, Spain, and France.
3) However, the number of cases in third-world countries is lower.
# The 20 countries with the highest active-case totals in `top`.
top_actives = (
    top.groupby(by='Country/Region')['Active']
    .sum()
    .sort_values(ascending=False)
    .head(20)
    .reset_index()
)
top_actives

# Bar chart: active count on x, country on y.
plt.figure(figsize=(15, 10))
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel("Total cases", fontsize=30)
plt.ylabel('Country', fontsize=30)
plt.title("Top 20 countries having most active cases", fontsize=30)
ax = sns.barplot(x=top_actives['Active'], y=top_actives['Country/Region'])

# Write the thousands-separated value at the end of every bar.
for i, value in enumerate(top_actives['Active']):
    ax.text(value, i - .05, f'{value:,.0f}', size=10, ha='left', va='center')
ax.set(xlabel='Total cases', ylabel='Country')
Observations :
1) As COVID-19 testing increases, the number of active cases is also increasing day by day.
2) The number of active cases is high in the US, Italy, Spain, and France.
# The 20 countries with the highest death totals in `top`.
top_deaths = (
    top.groupby(by='Country/Region')['Deaths']
    .sum()
    .sort_values(ascending=False)
    .head(20)
    .reset_index()
)
top_deaths

# Bar chart: death count on x, country on y.
plt.figure(figsize=(15, 10))
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel("Total cases", fontsize=30)
plt.ylabel('Country', fontsize=30)
plt.title("Top 20 countries having most deaths", fontsize=30)
ax = sns.barplot(x=top_deaths['Deaths'], y=top_deaths['Country/Region'])

# Write the thousands-separated value at the end of every bar.
for i, value in enumerate(top_deaths['Deaths']):
    ax.text(value, i - .05, f'{value:,.0f}', size=10, ha='left', va='center')
ax.set(xlabel='Total cases', ylabel='Country')
Observations :
1) Even though Italy has the 2nd best healthcare system according to the WHO, it has not been able to tackle the pandemic effectively.
2) Despite having so many confirmed cases, China was able to reduce the number of deaths.
3) The number of deaths is also on the rise, especially in Italy, Spain, and France.
# The 20 countries with the highest recovered totals in `top`.
top_recovered = (
    top.groupby(by='Country/Region')['Recovered']
    .sum()
    .sort_values(ascending=False)
    .head(20)
    .reset_index()
)
top_recovered

# Bar chart: recovered count on x, country on y.
plt.figure(figsize=(15, 10))
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel("Total cases", fontsize=30)
plt.ylabel('Country', fontsize=30)
plt.title("Top 20 countries having most recovered cases", fontsize=30)
ax = sns.barplot(x=top_recovered['Recovered'], y=top_recovered['Country/Region'])

# Write the thousands-separated value at the end of every bar.
for i, value in enumerate(top_recovered['Recovered']):
    ax.text(value, i - .05, f'{value:,.0f}', size=10, ha='left', va='center')
ax.set(xlabel='Total cases', ylabel='Country')
1) For a long time China led in the number of recoveries despite its huge number of confirmed cases, but Germany has recently surpassed China in recovered cases. No wonder: Germany has the best healthcare facilities.
2) Spain, the US, Italy, and Iran are also doing a good job.
3) We have to pump up these numbers for a promising future!
Now, we will look at the evolution of the virus in different countries and look at what strategies could be used to contain COVID-19.
The current situation (as of April 17th, 2020)
First, we will look at the current situation in five countries: Italy, China, US, Iran, and South Korea. (as of April 17th, 2020)
# Case table ordered by country and date, with a space-free alias column
# ("Country") for the country label.
tbl = full_table.sort_values(by=["Country/Region", "Date"]).reset_index(drop=True)
tbl["Country"] = tbl["Country/Region"]

# Sorted distinct countries and dates; the plotting code looks series up by
# a country's position in `conts` and uses `dates` as the x axis.
conts = sorted(set(tbl["Country"]))
dates = sorted(set(tbl["Date"]))

# One row per country (same order as `conts`): the per-date sum of confirmed
# cases. A boolean mask replaces the string-built query expression, which
# broke on country names containing a double quote, and the "Confirmed"
# column is selected before aggregating so no other column is summed.
confirmed = np.array([
    tbl.loc[tbl["Country"] == cont].groupby("Date")["Confirmed"].sum().values
    for cont in conts
])
def visualize_country(fig, cont, image_link, colors, step, xcor, ycor, done=True, multiple=False, sizex=0.78, sizey=0.2):
    """Add a country's confirmed-case curve and an image to a Plotly figure.

    Reads the module-level ``dates``, ``confirmed`` and ``conts``.

    Args:
        fig: Figure that receives the traces, the image and the layout.
        cont: Country name; must be present in ``conts``.
        image_link: Source of the layout image.
        colors: One trace is drawn per entry; the trace at position ``k`` is
            shifted down by ``step * k``.
        step: Vertical offset between successive traces.
        xcor, ycor: Image position in paper coordinates (the image is
            anchored at its right/bottom).
        done: When true the legend is hidden, the title names the country and
            the figure is shown; when false the legend is on, the title is
            generic and nothing is shown.
        multiple: When true the generic title is used regardless of ``done``.
        sizex, sizey: Image size in paper coordinates.
    """
    legend_on = not done
    for offset, shade in enumerate(colors):
        outline = dict(color='rgb(0, 0, 0)', width=0.5)
        curve = go.Scatter(
            x=dates,
            y=confirmed[conts.index(cont)] - step*offset,
            showlegend=legend_on,
            mode='lines+markers',
            name=cont,
            marker=dict(color=shade, line=outline),
        )
        fig.add_trace(curve)

    image_spec = dict(
        source=image_link,
        xref="paper", yref="paper",
        x=xcor, y=ycor,
        sizex=sizex, sizey=sizey,
        xanchor="right", yanchor="bottom",
    )
    fig.add_layout_image(image_spec)

    # Only a finished single-country figure carries the country in its title.
    if done and not multiple:
        title = "Confirmed cases in {}".format(cont)
    else:
        title = "Confirmed cases"
    fig.update_layout(xaxis_title="Date", yaxis_title="Confirmed cases", title=title, template="plotly_white", paper_bgcolor="#f0f0f0")

    if done:
        fig.show()
# One stand-alone figure per country, in this order: Italy, China, US, Iran,
# South Korea. Each is shown immediately because `done` keeps its default.
for country_plot in (
    dict(cont="Italy", image_link="https://upload.wikimedia.org/wikipedia/en/0/03/Flag_of_Italy.svg", colors=["seagreen"], step=400, xcor=0.85, ycor=0.7),
    dict(cont="China", image_link="https://upload.wikimedia.org/wikipedia/commons/f/fa/Flag_of_the_People%27s_Republic_of_China.svg", colors=["red"], step=1000, xcor=0.85, ycor=0.65),
    dict(cont="US", image_link="https://upload.wikimedia.org/wikipedia/en/a/a4/Flag_of_the_United_States.svg", colors=["navy"], step=60, xcor=0.85, ycor=0.5),
    dict(cont="Iran", image_link="https://upload.wikimedia.org/wikipedia/commons/c/ca/Flag_of_Iran.svg", colors=["indianred"], step=175, xcor=0.8, ycor=0.6),
    dict(cont="South Korea", image_link="https://upload.wikimedia.org/wikipedia/commons/0/09/Flag_of_South_Korea.svg", colors=["dodgerblue"], step=80, xcor=0.95, ycor=0.4),
):
    fig = go.Figure()
    visualize_country(fig, **country_plot)
1) Mass testing
2) GPS-based quarantine tracking to mitigate the spread of the virus.
Social distancing combined with thousands of tests everyday has reduced the number of new cases dramatically over the last few days.
The app, developed by the Ministry of the Interior and Safety, allows those who have been ordered not to leave home to stay in contact with case workers and report on their progress. It will also use GPS to keep track of their location to make sure they are not breaking their quarantine
# All five countries on one figure. The first four are added with done=False,
# so nothing is shown until the final call.
fig = go.Figure()
for overlay in (
    dict(cont="Italy", image_link="https://upload.wikimedia.org/wikipedia/en/0/03/Flag_of_Italy.svg", colors=["seagreen"], step=400, xcor=0.85, ycor=0.25, sizex=0.15, sizey=0.075),
    dict(cont="US", image_link="https://upload.wikimedia.org/wikipedia/en/a/a4/Flag_of_the_United_States.svg", colors=["navy"], step=60, xcor=0.980, ycor=0.8, sizex=0.1, sizey=0.065),
    dict(cont="Iran", image_link="https://upload.wikimedia.org/wikipedia/commons/c/ca/Flag_of_Iran.svg", colors=["indianred"], step=175, xcor=0.999, ycor=0.15, sizex=0.1, sizey=0.065),
    dict(cont="South Korea", image_link="https://upload.wikimedia.org/wikipedia/commons/0/09/Flag_of_South_Korea.svg", colors=["dodgerblue"], step=80, xcor=0.99, ycor=0.05, sizex=0.15, sizey=0.075),
):
    visualize_country(fig, done=False, **overlay)
fig.update_layout(showlegend=False)

# China goes last with multiple=True: the generic title is kept and, because
# done keeps its default, this call shows the finished figure.
visualize_country(fig, "China", "https://upload.wikimedia.org/wikipedia/commons/f/fa/Flag_of_the_People%27s_Republic_of_China.svg", colors=["red"], step=1000, xcor=0.6, ycor=0.2, sizex=0.15, sizey=0.075, multiple=True)
Let's look at the measures taken by China to contain the virus:
1) Wuhan was locked down on Jan 23rd 2020
2) Factories were closed across China on Feb 13th 2020
3) Complete (total) lockdown was imposed across China on Feb 20th 2020
# China's confirmed-case curve; done=False defers showing the figure.
fig = go.Figure()
visualize_country(fig, "China", "https://upload.wikimedia.org/wikipedia/commons/f/fa/Flag_of_the_People%27s_Republic_of_China.svg", colors=["red"], step=1000, xcor=0.85, ycor=0.65, done=False)

# Vertical marker lines, added in this order: 13 Feb (blue), 20 Feb (green),
# 23 Jan (orange). Each spans the given y range at its date.
for marker in (
    dict(when=Timestamp('2020-02-13 00:00:00'), low=50000, high=70000, color="RoyalBlue"),
    dict(when=Timestamp('2020-02-20 00:00:00'), low=65000, high=85000, color="Green"),
    dict(when=Timestamp('2020-01-23 00:00:00'), low=-10000, high=10000, color="Orange"),
):
    fig.add_shape(
        dict(
            type="line",
            x0=marker["when"],
            y0=marker["low"],
            x1=marker["when"],
            y1=marker["high"],
            line=dict(color=marker["color"], width=5),
        )
    )

fig.update_layout(title="Confirmed cases in China", showlegend=False)
fig.show()
We have plotted the number of confirmed cases in China for each day above. The orange line marks when Wuhan was locked down, the blue line marks when factories were closed across China, and the green line marks when a complete (total) lockdown was imposed across China. Notice how the curve starts to flatten after the complete lockdown is imposed. A complete lockdown helps reduce community transmission and mitigate the spread of the virus.
Measures taken by South Korea to contain the virus:
1) South Korea ramped up testing on February 29th
2) Government introduced a new GPS-enabled quarantine tracking app on March 6th
# South Korea's confirmed-case curve; done=False defers showing the figure.
fig = go.Figure()
visualize_country(fig, "South Korea", "https://upload.wikimedia.org/wikipedia/commons/0/09/Flag_of_South_Korea.svg", colors=["dodgerblue"], step=80, xcor=0.95, ycor=0.4, done=False)

# Vertical marker lines: 29 Feb (purple) and 6 Mar (deep pink), each spanning
# the given y range at its date.
for marker in (
    dict(when=Timestamp('2020-02-29 00:00:00'), low=2000, high=4000, color="purple"),
    dict(when=Timestamp('2020-03-06 00:00:00'), low=5500, high=7500, color="deeppink"),
):
    fig.add_shape(
        dict(
            type="line",
            x0=marker["when"],
            y0=marker["low"],
            x1=marker["when"],
            y1=marker["high"],
            line=dict(color=marker["color"], width=5),
        )
    )

fig.update_layout(title="Confirmed cases in Korea, South", showlegend=False)
fig.show()
We have plotted the number of confirmed cases in South Korea for each day above. The purple line marks when South Korea ramped up testing, and the pink line marks when a new GPS-enabled quarantine tracking app was deployed by the South Korean government. Together, these two measures have worked to reduce community transmission and flatten the curve towards the end of the first week of March.
from IPython.display import Image

# Display the picture stored as Word2Vec_Basic.PNG (700 wide, 800 high).
Image(
    filename='D:/GSUCoursework/BigDataExp/Datasets/Pictures/Word2Vec_Basic.PNG',
    height=800,
    width=700,
)

# Display the picture stored as Wrod2Vec_Relationships.PNG (800 wide, 700 high).
Image(
    filename='D:/GSUCoursework/BigDataExp/Datasets/Pictures/Wrod2Vec_Relationships.PNG',
    height=700,
    width=800,
)
We can take advantage of these intricate relationships between word vectors to find cures for COVID-19. The steps are as follows:
Step 1 - Find common words related to the study of COVID-19, such as "infection", "CoV", "viral", etc.
Step 2 - Find the words with lowest Euclidean distance to these words (most similar words).
Step 3 - Finally, find the words most similar to these words (second order similarity). These words will hopefully contain potential COVID-19 cures.
Note that the similarity between two Word2Vec vectors is calculated using the formula below (where u and v are the word vectors).
from IPython.display import Image

# Display the picture stored as Word2Vec_Formula.PNG (500 x 500).
Image(
    filename='D:/GSUCoursework/BigDataExp/Datasets/Pictures/Word2Vec_Formula.PNG',
    height=500,
    width=500,
)

# Display the picture stored as NLP_three_Steps.PNG (500 x 500).
Image(
    filename='D:/GSUCoursework/BigDataExp/Datasets/Pictures/NLP_three_Steps.PNG',
    height=500,
    width=500,
)
The approach detailed above is inspired by a research paper called "Unsupervised word embeddings capture latent knowledge from materials science literature", in which the authors find new materials with desirable properties (such as thermoelectricity) based solely on a large corpus of materials science literature. These materials had never been used for these purposes before, but they outperform old materials by a large margin. I hope to emulate the same method to look for COVID-19 cures. The diagram below illustrates what the authors did in their research.
from IPython.display import Image

# Display the picture stored as Thermoelectricity.PNG (500 x 500).
Image(
    filename='D:/GSUCoursework/BigDataExp/Datasets/Pictures/Thermoelectricity.PNG',
    height=500,
    width=500,
)
In the diagram above, we can see that the authors found two levels of words similar to "thermoelectric" in a hierarchical manner. The second-order similar words contained compounds like Li2CuSb, Cu7Te5, and CsAgGa2Se4, which turned out to be very good thermoelectric materials in real life.
def nonan(x):
    """Return *x* with newline characters removed, or "" if *x* is not a string.

    Non-string input (for example the float NaN that pandas uses for a
    missing cell, or None) is mapped to the empty string.

    Args:
        x: Any value.

    Returns:
        ``x`` without ``"\\n"`` characters when it is a string, else ``""``.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy.str_, which would otherwise be discarded.
    if isinstance(x, str):
        return x.replace("\n", "")
    return ""
# Join every abstract, cleaned by nonan, into one space-separated string.
text = ' '.join(map(nonan, papers_df["abstract"]))

# Word cloud of that text on a white 1200x1000 canvas, without collocations.
wordcloud = WordCloud(
    width=1200,
    height=1000,
    background_color='white',
    max_font_size=None,
    collocations=False,
).generate(text)

# Render the cloud as an image figure and set its title.
fig = px.imshow(wordcloud)
fig.update_layout(title_text='Common words in abstracts')